%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
%matplotlib inline
# Load the bank marketing dataset and take a first look at it.
# NOTE(review): assumes 'bank-full.csv' parses with pandas' default delimiter
# settings — confirm it is not the semicolon-delimited UCI variant.
df = pd.read_csv('bank-full.csv')
# First rows, dimensions, dtypes and missing-value counts.
df.head(10)
df.shape
df.info()
df.isna().sum() #check for % null values
# Partition the columns into categorical (object dtype) vs numeric names.
# FIX: the original collected names into set()s and then listed them, which
# loses the DataFrame's column order and — because Python randomizes str
# hashing — can reorder the later subplot layouts from run to run. Plain
# lists keep the original, deterministic column order.
columns_cat = []
columns_numeric = []
for col in df.columns:
    if df[col].dtype == 'object':
        columns_cat.append(col)
    else:
        columns_numeric.append(col)
print( f'Numeric fields {columns_numeric}')
print( f'Categorical fields {columns_cat}')
# Numeric fields: one count histogram per numeric column on a 2x4 grid.
# FIX: seaborn.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot (without a kde) is the direct replacement for distplot(kde=False).
fig, axs = plt.subplots(2, 4, figsize=(24, 16))
for i, key in enumerate(columns_numeric):
    sns.histplot(df[key], ax=axs[i // 4][i % 4])
plt.show()
##Observation
## pdays =-1 or pdays =0 needs to be analysed
# Numeric fields: per-Target box plots to compare the two class distributions.
m = len(columns_numeric)
fig, axs = plt.subplots(2, 4, figsize=(24, 16))
for idx, col in enumerate(columns_numeric):
    sns.boxplot(y=col, x='Target', data=df, ax=axs[idx // 4][idx % 4])
plt.show()
# Per-class descriptive statistics for every numeric column.
pd.DataFrame(df.groupby('Target')[columns_numeric].describe().T.unstack())
# Answer and observation:
# previous, duration and balance each show a notable outlier.
# Handle the extreme outliers spotted in the box plots above.
outlier_mask = (df.duration > 4000) | (df.previous > 250) | (df.balance > 80000)
print(df[outlier_mask])
# Drop only the Target == 'no' outliers: 'no' records are plentiful while the
# rare 'yes' records are worth keeping.
# FIX: the original dropped hard-coded index labels [24148, 26227, 29182,
# 39989], which raises KeyError (or drops the wrong rows) as soon as the CSV
# changes; selecting by the condition itself is robust to data updates.
df.drop(df[outlier_mask & (df.Target == 'no')].index, inplace=True)
## pdays analysis: contact counts by (Target, poutcome) for the three regimes.
for label, mask in [('pdays =-1', df.pdays == -1),
                    ('pdays =0', df.pdays == 0),
                    ('pdays >0', df.pdays > 0)]:
    print('\n' + label)
    grouped = df[mask].groupby(by=['Target', 'poutcome'])
    counts = grouped['contact'].count()
    # The pdays > 0 table was printed transposed in the original analysis.
    print(counts.T if label == 'pdays >0' else counts)
## majority of Target = Yes had pdays = -1
# for Target = yes the pdays < 365 for those with pdays != -1
# Categorical values: per-category counts split by Target on a 6x2 grid.
m = len(columns_cat)
fig, axs = plt.subplots(6, 2, figsize=(12, 20))
for pos, col in enumerate(columns_cat):
    panel = axs[pos // 2][pos % 2]
    sns.countplot(x=col, hue='Target', data=df, ax=panel)
    # Rotate the tick labels so long category names remain readable.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90)
plt.show()
# Answer / observation
# poutcome/unknown very high
# housing loan = no Target = yes more
# very few people with loan had Target = yes
# married - yes / most Target = yes
# default seems to be a very important criteria
# cellular seems to be the most common way to contact - but this can only mean it was easier to contact them
# management job seems to be most Target = yes
# Categorical values: frequency crosstab of each categorical column vs Target.
for col in columns_cat:
    table = pd.crosstab(df[col], df.Target, margins=True)
    print(table)
    print()
# Answer and observation:
# categorical values look ok; 'job = unknown' was a candidate for dropping,
# so inspect those rows before deciding.
print(df[(df.job=='unknown')].groupby(by=['Target','education','poutcome'])['marital'].count())
# we keep the job type unknown as 10%+ of unknown job type had target = yes
# Convert every object column to pandas' memory-efficient category dtype
# in one vectorized call instead of a per-column loop.
df[columns_cat] = df[columns_cat].astype('category')
df.info()
# checking correlation
# FIX: DataFrame.corr() raises TypeError on pandas >= 2.0 when non-numeric
# (here: category) columns are present; selecting the numeric columns first
# reproduces the old "silently ignore non-numeric" behaviour on every pandas
# version. Also folds five copy-pasted heatmap stanzas into one helper.
def _plot_corr(frame):
    """Show an annotated correlation heatmap of frame's numeric columns."""
    sns.heatmap(frame.select_dtypes(include='number').corr(),
                annot=True, fmt='.2g', cmap='coolwarm')
    plt.show()

# Full data, then drill-downs into the Target = 'yes' slice.
_plot_corr(df)
_plot_corr(df[(df.Target == 'yes')])
_plot_corr(df[(df.Target == 'yes') & (df.pdays == -1)])
_plot_corr(df[(df.Target == 'yes') & (df.pdays > 0)])
## previous successful ?
_plot_corr(df[(df.Target == 'yes') & (df.poutcome == 'success')])
## No real correlation
sns.pairplot(df, hue='Target', diag_kind="hist")
## Analysing previous and this campaign
dfg = df.groupby(by=['Target', 'poutcome'])
print('Previous and CUrrent Campaign')
print(dfg['previous'].count())
print()
print('**** number of campaigns')
# FIX: dfg['previous','campaign'] passed a single tuple key to the groupby —
# deprecated since pandas 0.25 and removed in 1.0; a list is the correct
# multi-column selector.
print(dfg[['previous', 'campaign']].mean())
print()
print('**** number of pdays since last campaign pdays !=-1')
dfg = df[df.pdays != -1].groupby(by=['Target', 'poutcome'])
print(dfg['pdays'].mean())
print(dfg['pdays'].count())
print('**** number of pdays since last campaign pdays ==-1')
dfg = df[df.pdays == -1].groupby(by=['Target', 'poutcome'])
print(dfg['pdays'].mean())
print(dfg['pdays'].count())
# Analysis
# previous campaign outcome unknown we were able to convert to Yes by more contacts
# previous campaign success and this campaign success highly correlated
# pdays =-1 are all unknown previous outcome
#replace categorical
# Map the binary yes/no columns (and the Target label) to 1/0. The
# commented-out mappings were considered but one-hot encoding is used for the
# multi-valued columns instead (see get_dummies below).
replaceStruct = {
#"month": {"jan": 1,"feb": 2,"mar": 3,"apr": 4,"may": 5,"jun": 6,"jul": 7,"aug": 8,"sep": 9,"oct": 10,"nov": 11,"dec": 12 },
"Target": {"yes":1, "no":0},
"housing": {"yes":1, "no":0},
"default": {"yes":1, "no":0},
"loan": {"yes":1, "no":0},
#"education": {"unknown":0, "primary":1, "secondary":2, "tertiary":3},
#"poutcome": {"unknown":-2, "success":1, "failure":0, "other":-1}
}
# NOTE(review): these columns were cast to category dtype earlier; replace()
# on categoricals is version-sensitive in pandas — verify the mapping applies.
df=df.replace(replaceStruct)
# One-hot encode the remaining multi-valued categoricals; drop_first avoids
# a redundant (perfectly collinear) dummy column per feature.
oneHotCols=['job','marital','education','contact','poutcome','month']
dfdata=pd.get_dummies(df, columns=oneHotCols, drop_first = True)
dfdata.head(3).T
# Prepare the feature matrix and label vector, then hold out 30% for testing.
x = dfdata.drop(columns="Target")
y = dfdata.pop("Target")
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=1)
from sklearn.tree import DecisionTreeClassifier
def build_performance_df(model, modelName, x_test, y_test, x_train, y_train):
    """Return a one-row DataFrame (index=modelName) of metrics for model.

    Columns: accuracy on test and train sets, plus recall, precision, F1 and
    ROC-AUC computed on the held-out test set.
    """
    # FIX: dropped the unused pred_train (model.score recomputes predictions).
    pred_test = model.predict(x_test)
    d = {'score_test': model.score(x_test, y_test),
         'score_train': model.score(x_train, y_train)}
    d["Recall"] = recall_score(y_test, pred_test)        # tp/(tp+fn)
    d["Precision"] = precision_score(y_test, pred_test)  # tp/(tp+fp)
    d["F1 Score"] = f1_score(y_test, pred_test)
    # FIX: ROC-AUC should be computed from a continuous score, not hard 0/1
    # predictions (which understates AUC); use the class-1 probability when
    # the model exposes it and fall back to labels otherwise.
    if hasattr(model, "predict_proba"):
        auc_input = model.predict_proba(x_test)[:, 1]
    else:
        auc_input = pred_test
    d["Roc Auc Score"] = roc_auc_score(y_test, auc_input)
    return pd.DataFrame(d, index=[modelName])
# Baseline model: a fully-grown (unpruned) entropy decision tree.
feature_cols = x_train.columns
model_entropy = DecisionTreeClassifier(criterion = 'entropy' , random_state = 7 )
model_entropy.fit(x_train, y_train)
#common data record to save performance
# df_performace accumulates one metrics row per model trained below.
df_performace = build_performance_df(model_entropy, 'DT_Entropy' , x_test , y_test , x_train , y_train )
from sklearn.tree import export_graphviz
# FIX: sklearn.externals.six was deprecated in scikit-learn 0.21 and removed
# in 0.23; the standard library provides StringIO directly.
from io import StringIO
from IPython.display import Image
import pydotplus
import graphviz
# Confusion matrix for the baseline tree (normalized; rows = actual class).
pd.crosstab(y_test, model_entropy.predict(x_test), rownames=['Actual'], colnames=['Predicted'] , margins = True, normalize = True)
# Grid-search over max_depth and min_samples_leaf to find the best pruned tree.
for depth in [7, 10, 16, 25]:
    for leaf in [50, 100]:
        candidate = DecisionTreeClassifier(criterion='entropy', max_depth=depth,
                                           min_samples_leaf=leaf, random_state=7)
        candidate.fit(x_train, y_train)
        label = 'DT_' + str(depth) + '_' + str(leaf)
        df_performace = pd.concat([df_performace,
                                   build_performance_df(candidate, label,
                                                        x_test, y_test, x_train, y_train)])
df_performace.sort_values(by=['score_test'], inplace=True, ascending=False)
df_performace.head(10)
#Best Decision Tree
# DT_16_100 , DT_25_100 , DT_7_50
# If you look at DT_7_50 its acceptable, but for this exercise I will take DT_16_100
# Refit the chosen pruned tree: depth <= 16, at least 100 samples per leaf.
best_dt_model = DecisionTreeClassifier(criterion = 'entropy' , max_depth=16, min_samples_leaf=100, random_state = 7 )
best_dt_model.fit(x_train, y_train)
pred_test= best_dt_model.predict(x_test)
# Export the fitted tree to DOT, render via pydotplus, save a PNG and display
# it inline in the notebook.
dot_data = StringIO()
export_graphviz(best_dt_model, out_file=dot_data,
filled=True, rounded=True,
special_characters=True,feature_names = feature_cols,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('best_dt_model.png')
Image(graph.create_png())
# Print the feature importance of the decision model.
# FIX: removed a dead statement — the original also stored
# best_dt_model.tree_.compute_feature_importances(normalize=False) in a
# feat_importance variable that was never read.
feat_imp_dict = dict(zip(feature_cols, best_dt_model.feature_importances_))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.sort_values(by=0, ascending=False)
## As expected, duration of last contact is always highly predictive of whether they accepted or not
## As people not interested will not entertain a long call
## Previous customers are always important
## We can remove unnecessary features
## But I am skipping this for this exercise
# Confusion matrix for the pruned tree (normalized; rows = actual class).
pd.crosstab(y_test, pred_test, rownames=['Actual'], colnames=['Predicted'] , margins = True, normalize = True)
# Fine-tune logistic regression over penalty, solver and C, appending each
# run's metrics to the shared performance table.
solver = ['newton-cg', 'liblinear']
C = [0.1, 0.25, 0.5, 0.75, 1]
penalty = ['l1', 'l2']
for solver_name in solver:
    for pen in penalty:
        # Guard clause: l1 regularisation is only supported by saga/liblinear.
        if pen == 'l1' and solver_name not in ['saga', 'liblinear']:
            continue
        for c in C:
            lr = LogisticRegression(C=c, random_state=7, penalty=pen, solver=solver_name)
            lr.fit(x_train, y_train)
            tag = 'LR_' + solver_name + '_' + pen + '_' + str(c)
            df_performace = pd.concat([df_performace,
                                       build_performance_df(lr, tag, x_test, y_test,
                                                            x_train, y_train)])
df_performace.sort_values(by=['score_test'], inplace=True, ascending=False)
df_performace.head(10)
## Best Logistic Regression: LR_liblinear_l1_0.75
# FIX: the grid above trained every model with random_state=7, but the "best"
# model was rebuilt with random_state=42 — an inconsistency; use 7 so this
# refit reproduces the selected run exactly.
best_lr_model = LogisticRegression(C=0.75, random_state=7, penalty='l1', solver='liblinear')
best_lr_model.fit(x_train, y_train)
pred_test= best_lr_model.predict(x_test)
# Coefficients of the fitted model serve as a feature-importance proxy.
feat_imp_dict = dict(zip(feature_cols, best_lr_model.coef_[0]))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.sort_values(by=0, ascending=False)
# poutcome_success is high - which signifies that existing customers who previously said yes are good candidates
# Confusion matrix (normalized; rows = actual class).
pd.crosstab(y_test, pred_test, rownames=['Actual'], colnames=['Predicted'] , margins = True, normalize = True)
from sklearn.ensemble import BaggingClassifier
# Bagging on 70% bootstrap samples: once seeded with the pruned best decision
# tree, once with the library default base estimator.
# NOTE(review): base_estimator was renamed to estimator in scikit-learn 1.2 —
# the old name is kept here for the sklearn version this notebook targets.
ests = [50, 100, 200]  # estimators
for base, prefix in [(best_dt_model, 'BAG_DTBEST_'), (None, 'BAG_None_')]:
    for n in ests:
        bagger = BaggingClassifier(base_estimator=base, max_samples=0.7,
                                   n_estimators=n, random_state=7)
        bagger = bagger.fit(x_train, y_train)
        df_performace = pd.concat([df_performace,
                                   build_performance_df(bagger, prefix + '_' + str(n),
                                                        x_test, y_test, x_train, y_train)])
df_performace.sort_values(by=['score_test'], inplace=False, ascending=False).head(20)
## Best bagging without overfit [max_samples = 0.7]: BAG_DTBEST__100
best_bag_model = BaggingClassifier(base_estimator=best_dt_model, max_samples=0.7,
                                   n_estimators=100, random_state=7)
best_bag_model = best_bag_model.fit(x_train, y_train)
# Confusion matrix (normalized; rows = actual class).
pred_test = best_bag_model.predict(x_test)
pd.crosstab(y_test, pred_test, rownames=['Actual'], colnames=['Predicted'] , margins = True, normalize = True)
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost: once with the default (stump) base estimator, once boosted on the
# pruned best decision tree.
ests = [ 50, 100,200] # estimators
for e in ests:
    model = AdaBoostClassifier(n_estimators=e, random_state=7)
    model = model.fit(x_train, y_train)
    df_performace = pd.concat( [df_performace , build_performance_df(model, 'ADA_None'+'_'+str(e), x_test , y_test , x_train , y_train )])
for e in ests:
    # NOTE(review): base_estimator was renamed to estimator in sklearn 1.2 —
    # old name kept for the sklearn version this notebook targets.
    model = AdaBoostClassifier(base_estimator = best_dt_model ,n_estimators=e, random_state=7)
    model = model.fit(x_train, y_train)
    df_performace = pd.concat( [df_performace , build_performance_df(model, 'ADA_DT'+'_'+str(e), x_test , y_test , x_train , y_train )])
df_performace.sort_values(by=['score_test'], inplace=False, ascending=False).head(30)
# Adaboost did not perform as good as Bagging
## Best AdaBoost: ADA_None_100
# FIX: the grid above used random_state=7, but the best model was rebuilt
# with random_state=1 — an inconsistency; use 7 to reproduce the selected run.
best_ada_model = AdaBoostClassifier(n_estimators=100, random_state=7)
best_ada_model = best_ada_model.fit(x_train, y_train)
# Confusion matrix (normalized; rows = actual class).
pred_test= best_ada_model.predict(x_test)
pd.crosstab(y_test, pred_test, rownames=['Actual'], colnames=['Predicted'] , margins = True, normalize = True)
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting: grid over ensemble size and tree depth.
ests = [50, 100, 200]  # estimators
depths = [3, 5, 7]
for n_trees in ests:
    for depth in depths:
        gbc = GradientBoostingClassifier(n_estimators=n_trees, random_state=7, max_depth=depth)
        gbc = gbc.fit(x_train, y_train)
        tag = 'GBC' + '_' + str(n_trees) + '_' + str(depth)
        df_performace = pd.concat([df_performace,
                                   build_performance_df(gbc, tag, x_test, y_test,
                                                        x_train, y_train)])
df_performace.sort_values(by=['score_test'], inplace=False, ascending=False).head(30)
# Gradient Boosting gives the best model performance.
best_gb_model = GradientBoostingClassifier(n_estimators=200, random_state=7, max_depth=5)
best_gb_model = best_gb_model.fit(x_train, y_train)
# Confusion matrix for the winner — raw counts this time (normalize=False).
pred_test = best_gb_model.predict(x_test)
pd.crosstab(y_test, pred_test, rownames=['Actual'], colnames=['Predicted'] , margins = True, normalize = False)
#conclusion - in the end The Gradient Boosting seems to give the best performance.